<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ArXiv CS.CV Papers (Image/Video Generation) - May 08, 2025</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/framer-motion/10.16.4/framer-motion.dev.js"></script>
    <!-- Example using Font Awesome (replace with your preferred icon library if needed) -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css">
    <style>
        @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

        :root {
            /* New Palette: Light, Clean, Futuristic with Teal/Aqua accents */
            --bg-color: #f8fafc; /* Tailwind slate-50 (Very Light Gray) */
            --card-bg-color: #ffffff; /* White */
            --text-color: #1e293b; /* Tailwind slate-800 (Dark Gray-Blue) */
            --text-muted-color: #64748b; /* Tailwind slate-500 (Medium Gray-Blue) */
            --header-color: #0f172a; /* Tailwind slate-900 (Very Dark Blue) */
            --highlight-primary: #14b8a6; /* Tailwind teal-500 */
            --highlight-secondary: #67e8f9; /* Tailwind cyan-300 */
            --border-color: #e2e8f0; /* Tailwind slate-200 (Light Gray) */
            --shadow-color: rgba(15, 23, 42, 0.08); /* Subtle shadow based on slate-900 */
        }

        body {
            background-color: var(--bg-color);
            color: var(--text-color);
            font-family: 'Inter', sans-serif;
            overflow-x: hidden; /* Prevent horizontal scroll */
            line-height: 1.6;
        }

        .bento-grid {
            display: grid;
            gap: 1.5rem; /* Tailwind gap-6 */
            grid-template-columns: 1fr; /* Force single column */
            padding-bottom: 4rem; /* Add padding at the bottom */
        }

        .bento-item {
            /* Apply semi-transparent white background and blur */
            background-color: rgba(255, 255, 255, 0.7); /* White with 70% opacity */
            backdrop-filter: blur(10px); /* Apply blur effect */
            -webkit-backdrop-filter: blur(10px); /* Safari prefix */
            border-radius: 1rem; /* Slightly larger radius */
            padding: 1.75rem; /* Slightly more padding */
            border: 1px solid rgba(226, 232, 240, 0.5); /* Lighter border with transparency */
            box-shadow: 0 4px 12px var(--shadow-color);
            transition: transform 0.3s ease-out, box-shadow 0.3s ease-out, background-color 0.3s ease-out;
            overflow: hidden; /* Ensure content doesn't overflow */
            position: relative; /* For potential pseudo-elements */
        }

        /* Removed ::before pseudo-element for a cleaner look */


        .bento-item:hover {
            transform: translateY(-6px);
            box-shadow: 0 10px 20px var(--shadow-color), 0 4px 8px rgba(15, 23, 42, 0.06); /* Adjusted hover shadow */
        }

        .paper-title {
            font-size: 1.125rem; /* Tailwind text-lg */
            font-weight: 600; /* Tailwind font-semibold */
            color: var(--highlight-primary); /* Use new primary highlight */
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            line-height: 1.4;
        }

        .paper-summary {
            font-size: 0.875rem; /* Tailwind text-sm */
            color: var(--text-muted-color);
            margin-bottom: 1.25rem; /* Tailwind mb-5 */
            line-height: 1.6;
        }

        .paper-link {
            display: inline-flex; /* Use flex for icon alignment */
            align-items: center;
            font-size: 0.875rem; /* Tailwind text-sm */
            font-weight: 600;
            color: var(--highlight-primary);
            text-decoration: none;
            padding: 0.5rem 1rem; /* Add padding */
            border-radius: 0.5rem; /* Slightly rounder */
            background-color: rgba(20, 184, 166, 0.08); /* Subtle teal background */
            border: 1px solid rgba(20, 184, 166, 0.2);
            transition: background-color 0.3s ease, color 0.3s ease, transform 0.2s ease;
        }

        .paper-link i {
            margin-right: 0.5rem; /* Tailwind mr-2 */
            transition: transform 0.3s ease;
        }

        .paper-link:hover {
            background-color: rgba(20, 184, 166, 0.15);
            color: #0d9488; /* Darker teal on hover */
            transform: translateY(-1px);
        }
        .paper-link:hover i {
             transform: translateX(2px);
        }

        .paper-authors {
            font-size: 0.75rem; /* Tailwind text-xs */
            color: var(--text-muted-color);
            margin-top: 1rem; /* Tailwind mt-4 */
            font-style: italic;
        }

        .header {
            text-align: center;
            margin-bottom: 3rem; /* Tailwind mb-12 */
            padding-top: 3rem; /* Tailwind pt-12 */
        }

        .header h1 {
            font-size: 2.5rem; /* Tailwind text-4xl or 5xl */
            font-weight: 700; /* Tailwind font-bold */
            color: var(--header-color);
            letter-spacing: -0.025em; /* Tailwind tracking-tight */
            margin-bottom: 0.5rem;
            /* Optional: Add a subtle text gradient */
            /* background: linear-gradient(90deg, var(--highlight-primary), var(--highlight-secondary)); */
            /* -webkit-background-clip: text; */
            /* -webkit-text-fill-color: transparent; */
        }

        .header p {
            font-size: 1.125rem; /* Tailwind text-lg */
            color: var(--text-muted-color);
            margin-top: 0.5rem; /* Tailwind mt-2 */
            max-width: 600px;
            margin-left: auto;
            margin-right: auto;
        }

        .footer {
            text-align: center;
            color: var(--text-muted-color);
            font-size: 0.875rem; /* Tailwind text-sm */
            padding-top: 2rem;
            padding-bottom: 2rem; /* Tailwind py-8 */
            border-top: 1px solid var(--border-color);
            margin-top: 4rem;
        }

        /* Simple line graphic element (optional) */
        .line-graphic {
            height: 1px; /* Thinner line */
            background: linear-gradient(90deg, rgba(20, 184, 166, 0), var(--highlight-primary), rgba(20, 184, 166, 0));
            opacity: 0.6;
            margin: 1.5rem 0; /* Adjust margin */
        }

        /* Framer Motion requires the script, styles enhance appearance */
        [data-motion-element] {
             /* Base styles for elements animated by Framer Motion */
        }

        .paper-tldr {
            font-size: 0.95rem; /* Slightly bigger than summary */
            color: #475569; /* Changed to Tailwind slate-600 (slightly darker than summary) */
            margin-top: 0.75rem; /* Tailwind mt-3 */
            margin-bottom: 0.75rem; /* Tailwind mb-2 */
            /* font-style: italic; */
            font-weight: bold;
        }

        .paper-rating {
            margin-top: 1rem; /* Tailwind mt-4 */
            margin-bottom: 1rem; /* Tailwind mb-4 */
            color: #f59e0b; /* Tailwind amber-500 */
        }

        .paper-rating i {
            margin-right: 0.125rem; /* Tailwind mr-0.5 */
        }

        /* Apply consistent star color to sub-ratings */
        .paper-sub-ratings .rating-item i {
            color: #f59e0b; /* Match overall rating star color (amber-500) */
            margin-right: 0.125rem; /* Consistent spacing */
        }

    </style>
</head>
<body class="container mx-auto px-4 antialiased">

    <motion.div
        initial="{ opacity: 0, y: -30 }"
        animate="{ opacity: 1, y: 0 }"
        transition="{ duration: 0.6, ease: 'easeOut' }"
        class="header"
        data-motion-element
    >
        <h1>AIGC Daily Papers</h1>
        <p>Daily papers related to Image/Video/Multimodal Generation from cs.CV</p>
        <p>May 08, 2025</p>
        <div class="line-graphic mt-4 mb-8 mx-auto w-1/4"></div> <!-- Added line graphic -->
    </motion.div>

    <div class="bento-grid" id="paper-grid">
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.0, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">HunyuanCustom: A Multimodal-Driven Architecture for Customized Video Generation</h2>
            <p class="paper-summary">Customized video generation aims to produce videos featuring specific
subjects under flexible user-defined conditions, yet existing methods often
struggle with identity consistency and limited input modalities. In this paper,
we propose HunyuanCustom, a multi-modal customized video generation framework
that emphasizes subject consistency while supporting image, audio, video, and
text conditions. Built upon HunyuanVideo, our model first addresses the
image-text conditioned generation task by introducing a text-image fusion
module based on LLaVA for enhanced multi-modal understanding, along with an
image ID enhancement module that leverages temporal concatenation to reinforce
identity features across frames. To enable audio- and video-conditioned
generation, we further propose modality-specific condition injection
mechanisms: an AudioNet module that achieves hierarchical alignment via spatial
cross-attention, and a video-driven injection module that integrates
latent-compressed conditional video through a patchify-based feature-alignment
network. Extensive experiments on single- and multi-subject scenarios
demonstrate that HunyuanCustom significantly outperforms state-of-the-art open-
and closed-source methods in terms of ID consistency, realism, and text-video
alignment. Moreover, we validate its robustness across downstream tasks,
including audio and video-driven customized video generation. Our results
highlight the effectiveness of multi-modal conditioning and identity-preserving
strategies in advancing controllable video generation. All the code and models
are available at https://hunyuancustom.github.io.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: hunyuancustom is a multimodal video generation framework enhancing identity consistency and supporting image, audio, video, and text conditions, outperforming existing methods in multiple aspects.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: hunyuancustom是一个多模态视频生成框架，它增强了身份一致性，并支持图像、音频、视频和文本条件，在多个方面优于现有方法。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(10/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(9/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04512v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Teng Hu, Zhentao Yu, Zhengguang Zhou, Sen Liang, Yuan Zhou, Qin Lin, Qinglin Lu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.05, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">On Path to Multimodal Generalist: General-Level and General-Bench</h2>
            <p class="paper-summary">The Multimodal Large Language Model (MLLM) is currently experiencing rapid
growth, driven by the advanced capabilities of LLMs. Unlike earlier
specialists, existing MLLMs are evolving towards a Multimodal Generalist
paradigm. Initially limited to understanding multiple modalities, these models
have advanced to not only comprehend but also generate across modalities. Their
capabilities have expanded from coarse-grained to fine-grained multimodal
understanding and from supporting limited modalities to arbitrary ones. While
many benchmarks exist to assess MLLMs, a critical question arises: Can we
simply assume that higher performance across tasks indicates a stronger MLLM
capability, bringing us closer to human-level AI? We argue that the answer is
not as straightforward as it seems. This project introduces General-Level, an
evaluation framework that defines 5-scale levels of MLLM performance and
generality, offering a methodology to compare MLLMs and gauge the progress of
existing systems towards more robust multimodal generalists and, ultimately,
towards AGI. At the core of the framework is the concept of Synergy, which
measures whether models maintain consistent capabilities across comprehension
and generation, and across multiple modalities. To support this evaluation, we
present General-Bench, which encompasses a broader spectrum of skills,
modalities, formats, and capabilities, including over 700 tasks and 325,800
instances. The evaluation results that involve over 100 existing
state-of-the-art MLLMs uncover the capability rankings of generalists,
highlighting the challenges in reaching genuine AI. We expect this project to
pave the way for future research on next-generation multimodal foundation
models, providing a robust infrastructure to accelerate the realization of AGI.
Project page: https://generalist.top/</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces general-level, a 5-scale evaluation framework, and general-bench, a benchmark with over 700 tasks, to assess the progress of multimodal large language models (mllms) toward agi by measuring synergy across modalities and comprehension/generation capabilities. they evaluate 100 existing mllms, revealing capability rankings and challenges in achieving agi.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了 general-level, 一个五尺度评估框架，以及 general-bench，一个包含超过700个任务的基准，旨在通过测量跨模态及理解/生成能力的协同作用，评估多模态大型语言模型 (mllm) 在实现 agi 方面的进展。他们评估了 100 个现有的 mllm，揭示了能力排名以及实现 agi 所面临的挑战。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04620v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Hao Fei, Yuan Zhou, Juncheng Li, Xiangtai Li, Qingshan Xu, Bobo Li, Shengqiong Wu, Yaoting Wang, Junbao Zhou, Jiahao Meng, Qingyu Shi, Zhiyuan Zhou, Liangtao Shi, Minghe Gao, Daoan Zhang, Zhiqi Ge, Weiming Wu, Siliang Tang, Kaihang Pan, Yaobo Ye, Haobo Yuan, Tao Zhang, Tianjie Ju, Zixiang Meng, Shilin Xu, Liyu Jia, Wentao Hu, Meng Luo, Jiebo Luo, Tat-Seng Chua, Shuicheng Yan, Hanwang Zhang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.1, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Text2CT: Towards 3D CT Volume Generation from Free-text Descriptions Using Diffusion Model</h2>
            <p class="paper-summary">Generating 3D CT volumes from descriptive free-text inputs presents a
transformative opportunity in diagnostics and research. In this paper, we
introduce Text2CT, a novel approach for synthesizing 3D CT volumes from textual
descriptions using the diffusion model. Unlike previous methods that rely on
fixed-format text input, Text2CT employs a novel prompt formulation that
enables generation from diverse, free-text descriptions. The proposed framework
encodes medical text into latent representations and decodes them into
high-resolution 3D CT scans, effectively bridging the gap between semantic text
inputs and detailed volumetric representations in a unified 3D framework. Our
method demonstrates superior performance in preserving anatomical fidelity and
capturing intricate structures as described in the input text. Extensive
evaluations show that our approach achieves state-of-the-art results, offering
promising potential applications in diagnostics, and data augmentation.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces text2ct, a diffusion model-based approach for generating 3d ct volumes directly from free-text medical descriptions, demonstrating state-of-the-art performance in anatomical fidelity and structural detail.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种名为text2ct的方法，它使用扩散模型直接从自由文本医学描述生成三维ct体数据，并在解剖保真度和结构细节方面表现出最先进的性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04522v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Pengfei Guo, Can Zhao, Dong Yang, Yufan He, Vishwesh Nath, Ziyue Xu, Pedro R. A. S. Bassi, Zongwei Zhou, Benjamin D. Simon, Stephanie Anne Harmon, Baris Turkbey, Daguang Xu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.15000000000000002, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Defining and Quantifying Creative Behavior in Popular Image Generators</h2>
            <p class="paper-summary">Creativity of generative AI models has been a subject of scientific debate in
the last years, without a conclusive answer. In this paper, we study creativity
from a practical perspective and introduce quantitative measures that help the
user to choose a suitable AI model for a given task. We evaluated our measures
on a number of popular image-to-image generation models, and the results of
this suggest that our measures conform to human intuition.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces quantitative measures for assessing the creativity of image generation models and evaluates several popular models, finding the measures align with human perception.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种量化图像生成模型创造力的方法，并评估了几种流行的模型，发现该方法与人类感知相符。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04497v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Aditi Ramaswamy</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.2, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Efficient Flow Matching using Latent Variables</h2>
            <p class="paper-summary">Flow matching models have shown great potential in image generation tasks
among probabilistic generative models. Building upon the ideas of continuous
normalizing flows, flow matching models generalize the transport path of the
diffusion models from a simple prior distribution to the data. Most flow
matching models in the literature do not explicitly model the underlying
structure/manifold in the target data when learning the flow from a simple
source distribution like the standard Gaussian. This leads to inefficient
learning, especially for many high-dimensional real-world datasets, which often
reside in a low-dimensional manifold. Existing strategies of incorporating
manifolds, including data with underlying multi-modal distribution, often
require expensive training and hence frequently lead to suboptimal performance.
To this end, we present \texttt{Latent-CFM}, which provides simplified
training/inference strategies to incorporate multi-modal data structures using
pretrained deep latent variable models. Through experiments on multi-modal
synthetic data and widely used image benchmark datasets, we show that
\texttt{Latent-CFM} exhibits improved generation quality with significantly
less training ($\sim 50\%$ less in some cases) and computation than
state-of-the-art flow matching models. Using a 2d Darcy flow dataset, we
demonstrate that our approach generates more physically accurate samples than
competitive approaches. In addition, through latent space analysis, we
demonstrate that our approach can be used for conditional image generation
conditioned on latent features.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces latent-cfm, a novel flow matching model that leverages latent variable models to improve generation quality and training efficiency for multi-modal data, achieving state-of-the-art results with significantly less computation.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了latent-cfm，一种新颖的流匹配模型，它利用潜在变量模型来提高多模态数据的生成质量和训练效率，以显著降低的计算量实现了最先进的结果。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04486v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Anirban Samaddar, Yixuan Sun, Viktor Nilsson, Sandeep Madireddy</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.25, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">RLMiniStyler: Light-weight RL Style Agent for Arbitrary Sequential Neural Style Generation</h2>
            <p class="paper-summary">Arbitrary style transfer aims to apply the style of any given artistic image
to another content image. Still, existing deep learning-based methods often
require significant computational costs to generate diverse stylized results.
Motivated by this, we propose a novel reinforcement learning-based framework
for arbitrary style transfer RLMiniStyler. This framework leverages a unified
reinforcement learning policy to iteratively guide the style transfer process
by exploring and exploiting stylization feedback, generating smooth sequences
of stylized results while achieving model lightweight. Furthermore, we
introduce an uncertainty-aware multi-task learning strategy that automatically
adjusts loss weights to adapt to the content and style balance requirements at
different training stages, thereby accelerating model convergence. Through a
series of experiments across image various resolutions, we have validated the
advantages of RLMiniStyler over other state-of-the-art methods in generating
high-quality, diverse artistic image sequences at a lower cost. Codes are
available at https://github.com/fengxiaoming520/RLMiniStyler.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces rlministyler, a lightweight reinforcement learning framework for arbitrary style transfer that generates diverse, high-quality stylized image sequences at a lower computational cost.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了 rlministyler，一种轻量级的强化学习框架，用于任意风格迁移，以较低的计算成本生成多样化、高质量的风格化图像序列。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04424v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jing Hu, Chengming Feng, Shu Hu, Ming-Ching Chang, Xin Li, Xi Wu, Xin Wang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.30000000000000004, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">CountDiffusion: Text-to-Image Synthesis with Training-Free Counting-Guidance Diffusion</h2>
            <p class="paper-summary">Stable Diffusion has advanced text-to-image synthesis, but training models to
generate images with accurate object quantity is still difficult due to the
high computational cost and the challenge of teaching models the abstract
concept of quantity. In this paper, we propose CountDiffusion, a training-free
framework aiming at generating images with correct object quantity from textual
descriptions. CountDiffusion consists of two stages. In the first stage, an
intermediate denoising result is generated by the diffusion model to predict
the final synthesized image with one-step denoising, and a counting model is
used to count the number of objects in this image. In the second stage, a
correction module is used to correct the object quantity by changing the
attention map of the object with universal guidance. The proposed
CountDiffusion can be plugged into any diffusion-based text-to-image (T2I)
generation models without further training. Experiment results demonstrate the
superiority of our proposed CountDiffusion, which improves the accurate object
quantity generation ability of T2I models by a large margin.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces countdiffusion, a training-free method for improving object quantity accuracy in text-to-image synthesis using diffusion models by incorporating a counting model and correction module.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了countdiffusion，一种无需训练的方法，通过结合计数模型和校正模块，提高文本到图像合成中目标数量的准确性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04347v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Yanyu Li, Pencheng Wan, Liang Han, Yaowei Wang, Liqiang Nie, Min Zhang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.35000000000000003, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Multi-turn Consistent Image Editing</h2>
            <p class="paper-summary">Many real-world applications, such as interactive photo retouching, artistic
content creation, and product design, require flexible and iterative image
editing. However, existing image editing methods primarily focus on achieving
the desired modifications in a single step, which often struggles with
ambiguous user intent, complex transformations, or the need for progressive
refinements. As a result, these methods frequently produce inconsistent
outcomes or fail to meet user expectations. To address these challenges, we
propose a multi-turn image editing framework that enables users to iteratively
refine their edits, progressively achieving more satisfactory results. Our
approach leverages flow matching for accurate image inversion and a
dual-objective Linear Quadratic Regulators (LQR) for stable sampling,
effectively mitigating error accumulation. Additionally, by analyzing the
layer-wise roles of transformers, we introduce a adaptive attention
highlighting method that enhances editability while preserving multi-turn
coherence. Extensive experiments demonstrate that our framework significantly
improves edit success rates and visual fidelity compared to existing methods.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces a multi-turn image editing framework that allows users to iteratively refine edits, leveraging flow matching, dual-objective lqr, and adaptive attention highlighting to improve edit success rates and visual fidelity.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一个多轮图像编辑框架，允许用户迭代地改进编辑，利用流动匹配、双目标lqr和自适应注意力突出显示来提高编辑成功率和视觉保真度。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04320v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Zijun Zhou, Yingying Deng, Xiangyu He, Weiming Dong, Fan Tang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.4, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Person-In-Situ: Scene-Consistent Human Image Insertion with Occlusion-Aware Pose Control</h2>
            <p class="paper-summary">Compositing human figures into scene images has broad applications in areas
such as entertainment and advertising. However, existing methods often cannot
handle occlusion of the inserted person by foreground objects and unnaturally
place the person in the frontmost layer. Moreover, they offer limited control
over the inserted person's pose. To address these challenges, we propose two
methods. Both allow explicit pose control via a 3D body model and leverage
latent diffusion models to synthesize the person at a contextually appropriate
depth, naturally handling occlusions without requiring occlusion masks. The
first is a two-stage approach: the model first learns a depth map of the scene
with the person through supervised learning, and then synthesizes the person
accordingly. The second method learns occlusion implicitly and synthesizes the
person directly from input data without explicit depth supervision.
Quantitative and qualitative evaluations show that both methods outperform
existing approaches by better preserving scene consistency while accurately
reflecting occlusions and user-specified poses.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces two methods for realistically inserting human figures into scene images with occlusion handling and pose control using latent diffusion models, one with explicit depth learning and the other with implicit occlusion learning.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了两种方法，利用潜在扩散模型将人物图像逼真地插入到场景图像中，同时处理遮挡和姿势控制。一种方法采用显式深度学习，另一种方法采用隐式遮挡学习。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04052v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Shun Masuda, Yuki Endo, Yoshihiro Kanamori</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.45, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">TerraFusion: Joint Generation of Terrain Geometry and Texture Using Latent Diffusion Models</h2>
            <p class="paper-summary">3D terrain models are essential in fields such as video game development and
film production. Since surface color often correlates with terrain geometry,
capturing this relationship is crucial to achieving realism. However, most
existing methods generate either a heightmap or a texture, without sufficiently
accounting for the inherent correlation. In this paper, we propose a method
that jointly generates terrain heightmaps and textures using a latent diffusion
model. First, we train the model in an unsupervised manner to randomly generate
paired heightmaps and textures. Then, we perform supervised learning of an
external adapter to enable user control via hand-drawn sketches. Experiments
show that our approach allows intuitive terrain generation while preserving the
correlation between heightmaps and textures.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces terrafusion, a latent diffusion model for jointly generating terrain heightmaps and textures, enabling user control through sketch-based input while preserving correlation between geometry and appearance.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍 terrafusion，一种潜在扩散模型，用于联合生成地形高度图和纹理，通过基于草图的输入实现用户控制，同时保持几何形状和外观之间的相关性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04050v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Kazuki Higo, Toshiki Kanai, Yuki Endo, Yoshihiro Kanamori</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.5, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">VideoPath-LLaVA: Pathology Diagnostic Reasoning Through Video Instruction Tuning</h2>
            <p class="paper-summary">We present VideoPath-LLaVA, the first large multimodal model (LMM) in
computational pathology that integrates three distinct image scenarios, single
patch images, automatically keyframe-extracted clips, and manually segmented
video pathology images, to mimic the natural diagnostic process of
pathologists. By generating detailed histological descriptions and culminating
in a definitive sign-out diagnosis, VideoPath-LLaVA bridges visual narratives
with diagnostic reasoning.
  Central to our approach is the VideoPath-Instruct dataset, comprising 4278
video and diagnosis-specific chain-of-thought instructional pairs sourced from
educational histopathology videos on YouTube. Although high-quality data is
critical for enhancing diagnostic reasoning, its creation is time-intensive and
limited in volume. To overcome this challenge, we transfer knowledge from
existing single-image instruction datasets to train on weakly annotated,
keyframe-extracted clips, followed by fine-tuning on manually segmented videos.
VideoPath-LLaVA establishes a new benchmark in pathology video analysis and
offers a promising foundation for future AI systems that support clinical
decision-making through integrated visual and diagnostic reasoning. Our code,
data, and model are publicly available at
https://github.com/trinhvg/VideoPath-LLaVA.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: videopath-llava is a new large multimodal model for pathology diagnostic reasoning, trained on a novel dataset of histopathology videos and images, and offers a benchmark for ai in clinical decision-making.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: videopath-llava 是一个新的大型多模态模型，用于病理诊断推理，它使用一个新颖的组织病理学视频和图像数据集进行训练，并为临床决策中ai的应用提供了一个基准。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(6/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04192v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Trinh T. L. Vuong, Jin Tae Kwak</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.55, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">MAISY: Motion-Aware Image SYnthesis for MedicalImage Motion Correction</h2>
            <p class="paper-summary">Patient motion during medical image acquisition causes blurring, ghosting,
and distorts organs, which makes image interpretation challenging.Current
state-of-the-art algorithms using Generative Adversarial Network (GAN)-based
methods with their ability to learn the mappings between corrupted images and
their ground truth via Structural Similarity Index Measure (SSIM) loss
effectively generate motion-free images. However, we identified the following
limitations: (i) they mainly focus on global structural characteristics and
therefore overlook localized features that often carry critical pathological
information, and (ii) the SSIM loss function struggles to handle images with
varying pixel intensities, luminance factors, and variance. In this study, we
propose Motion-Aware Image SYnthesis (MAISY) which initially characterize
motion and then uses it for correction by: (a) leveraging the foundation model
Segment Anything Model (SAM), to dynamically learn spatial patterns along
anatomical boundaries where motion artifacts are most pronounced and, (b)
introducing the Variance-Selective SSIM (VS-SSIM) loss which adaptively
emphasizes spatial regions with high pixel variance to preserve essential
anatomical details during artifact correction. Experiments on chest and head CT
datasets demonstrate that our model outperformed the state-of-the-art
counterparts, with Peak Signal-to-Noise Ratio (PSNR) increasing by 40%, SSIM by
10%, and Dice by 16%.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces maisy, a gan-based method for medical image motion correction that leverages sam for spatial pattern learning and a variance-selective ssim loss for preserving anatomical details, outperforming existing methods on ct datasets.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种名为maisy 的基于 gan 的医学图像运动校正方法，该方法利用 sam 进行空间模式学习，并使用方差选择性 ssim 损失来保留解剖细节，在 ct 数据集上优于现有方法。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(4/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(6/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04105v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Andrew Zhang, Hao Wang, Shuchang Ye, Michael Fulham, Jinman Kim</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.6000000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Deep residual learning with product units</h2>
            <p class="paper-summary">We propose a deep product-unit residual neural network (PURe) that integrates
product units into residual blocks to improve the expressiveness and parameter
efficiency of deep convolutional networks. Unlike standard summation neurons,
product units enable multiplicative feature interactions, potentially offering
a more powerful representation of complex patterns. PURe replaces conventional
convolutional layers with 2D product units in the second layer of each residual
block, eliminating nonlinear activation functions to preserve structural
information. We validate PURe on three benchmark datasets. On Galaxy10 DECaLS,
PURe34 achieves the highest test accuracy of 84.89%, surpassing the much deeper
ResNet152, while converging nearly five times faster and demonstrating strong
robustness to Poisson noise. On ImageNet, PURe architectures outperform
standard ResNet models at similar depths, with PURe34 achieving a top-1
accuracy of 80.27% and top-5 accuracy of 95.78%, surpassing deeper ResNet
variants (ResNet50, ResNet101) while utilizing significantly fewer parameters
and computational resources. On CIFAR-10, PURe consistently outperforms ResNet
variants across varying depths, with PURe272 reaching 95.01% test accuracy,
comparable to ResNet1001 but at less than half the model size. These results
demonstrate that PURe achieves a favorable balance between accuracy,
efficiency, and robustness. Compared to traditional residual networks, PURe not
only achieves competitive classification performance with faster convergence
and fewer parameters, but also demonstrates greater robustness to noise. Its
effectiveness across diverse datasets highlights the potential of
product-unit-based architectures for scalable and reliable deep learning in
computer vision.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces pure, a deep residual network integrating product units for improved expressiveness and efficiency, validated on image classification benchmarks with promising results regarding accuracy, speed, and robustness.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了pure，一种集成乘积单元的深度残差网络，旨在提高表达性和效率。在图像分类基准测试中验证了其在准确性、速度和鲁棒性方面的良好结果。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04397v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Ziyuan Li, Uwe Jaekel, Babette Dellen</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.65, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">A Weak Supervision Learning Approach Towards an Equitable Parking Lot Occupancy Estimation</h2>
            <p class="paper-summary">The scarcity and high cost of labeled high-resolution imagery have long
challenged remote sensing applications, particularly in low-income regions
where high-resolution data are scarce. In this study, we propose a weak
supervision framework that estimates parking lot occupancy using 3m resolution
satellite imagery. By leveraging coarse temporal labels -- based on the
assumption that parking lots of major supermarkets and hardware stores in
Germany are typically full on Saturdays and empty on Sundays -- we train a
pairwise comparison model that achieves an AUC of 0.92 on large parking lots.
The proposed approach minimizes the reliance on expensive high-resolution
images and holds promise for scalable urban mobility analysis. Moreover, the
method can be adapted to assess transit patterns and resource allocation in
vulnerable communities, providing a data-driven basis to improve the well-being
of those most in need.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper uses weak supervision (temporal assumptions about parking lot occupancy) to estimate parking lot occupancy from 3m satellite imagery, addressing data scarcity in remote sensing particularly relevant to low-income regions.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文利用弱监督（关于停车场占用情况的时间假设）从3米卫星图像中估计停车场占用率，解决了遥感数据稀缺问题，尤其与低收入地区相关。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(1/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(3/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.04229v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Theophilus Aidoo, Till Koebe, Akansh Maurya, Hewan Shrestha, Ingmar Weber</p>
            
        </motion.div>
        
    </div>

    <footer class="footer">
        Generated on 2025-05-08 04:32:02 UTC. Powered by <a href="https://github.com/onion-liu" target="_blank">onion-liu</a>.
    </footer>

</body>
</html>